/////////////////////////////////////////////////////////////////
// Sequence.h
//
// Class for reading/manipulating single sequence character data.
/////////////////////////////////////////////////////////////////

#ifndef SEQUENCE_H
#define SEQUENCE_H

#include <string>
#include <fstream>
#include <iostream>
#include <cctype>
#include <cstdlib>
#include "SafeVector.h"
#include "FileBuffer.h"

namespace PROBCONS {
  /////////////////////////////////////////////////////////////////
  // Sequence
  //
  // Class for storing sequence information.
  /////////////////////////////////////////////////////////////////

  class Sequence {

    bool isValid;                // a boolean indicating whether the sequence data is valid or not
    string header;               // string containing the comment line of the FASTA file
    SafeVector<char> *data;      // pointer to character data
    int length;                  // length of the sequence
    int sequenceLabel;           // integer sequence label, typically to indicate the ordering of sequences
    //   in a Multi-FASTA file
    int inputLabel;              // position of sequence in original input

    /////////////////////////////////////////////////////////////////
    // Sequence::Sequence()
    //
    // Default constructor.  Does nothing.
    /////////////////////////////////////////////////////////////////

    Sequence () : isValid (false), header (""), data (NULL), length (0), sequenceLabel (0), inputLabel (0) {}

  public:

    /////////////////////////////////////////////////////////////////
    // Sequence::Sequence()
    //
    // Constructor.  Reads the sequence from a FileBuffer.
    /////////////////////////////////////////////////////////////////

    Sequence (FileBuffer &infile, bool stripGaps = false) : isValid (false), header ("~"), data (NULL), length(0), sequenceLabel (0), inputLabel (0) {

      // read until the first non-blank line
      while (!infile.eof()){
	infile.GetLine (header);
	if (header.length() != 0) break;
      }

      // check to make sure that it is a correct header line
      if (header[0] == '>'){

	// if so, remove the leading ">"
	header = header.substr (1);

	// remove any leading or trailing white space in the header comment
	while (header.length() > 0 && isspace (header[0])) header = header.substr (1);
	while (header.length() > 0 && isspace (header[header.length() - 1])) header = header.substr(0, header.length() - 1);

	// get ready to read the data[] array; note that data[0] is always '@'
	char ch;
	data = new SafeVector<char>; assert (data);
	data->push_back ('@');

	// get a character from the file
	while (infile.Get(ch)){

	  // if we've reached a new comment line, put the character back and stop
	  if (ch == '>'){ infile.UnGet(); break; }

	  // skip whitespace
	  if (isspace (ch)) continue;

	  // substitute gap character
	  if (ch == '.') ch = '-';
	  if (stripGaps && ch == '-') continue;

	  // check for known characters
	  if (!((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '*' || ch == '-')){
	    cerr << "ERROR: Unknown character encountered: " << ch << endl;
	    exit (1);
	  }

	  // everything's ok so far, so just store this character.
	  data->push_back(ch);
	  ++length;
	}

	// sequence must contain data in order to be valid
	isValid = length > 0;
	if (!isValid){
	  delete data;
	  data = NULL;
	}
      }
    }

  
    /////////////////////////////////////////////////////////////////
    // Sequence::Sequence()
    //
    // Constructor.  Builds a sequence from existing data.  Note
    // that the data must use one-based indexing where data[0] should
    // be set to '@'.
    /////////////////////////////////////////////////////////////////

    Sequence (SafeVector<char> *data, string header, int length, int sequenceLabel, int inputLabel) :
      isValid (data != NULL), header(header), data(data), length (length), sequenceLabel (sequenceLabel), inputLabel (inputLabel) {
      assert (data);
      assert ((*data)[0] == '@');
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::Sequence()
    //
    // Destructor.  Release allocated memory.
    /////////////////////////////////////////////////////////////////

    ~Sequence (){
      if (data){
	assert (isValid);
	delete data;
	data = NULL;
	isValid = false;
      }
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::GetHeader()
    //
    // Return the string comment associated with this sequence.
    /////////////////////////////////////////////////////////////////

    string GetHeader () const {
      return header;
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::GetName()
    //
    // Return the first word of the string comment associated with this sequence.
    /////////////////////////////////////////////////////////////////

    string GetName () const {
      char name[1024];
      sscanf (header.c_str(), "%s", name);
      return string(name);
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::GetDataPtr()
    //
    // Return the iterator to data associated with this sequence.
    /////////////////////////////////////////////////////////////////

    SafeVector<char>::iterator GetDataPtr(){
      assert (isValid);
      assert (data);
      return data->begin();
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::GetPosition()
    //
    // Return the character at position i.  Recall that the character
    // data is stored with one-based indexing.
    /////////////////////////////////////////////////////////////////

    char GetPosition (int i) const {
      assert (isValid);
      assert (data);
      assert (i >= 1 && i <= length);
      return (*data)[i];
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::SetLabel()
    //
    // Sets the sequence label to i.
    /////////////////////////////////////////////////////////////////

    void SetLabel (int i){
      assert (isValid);
      sequenceLabel = i;
      inputLabel = i;
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::SetSortLabel()
    //
    // Sets the sequence sorting label to i.
    /////////////////////////////////////////////////////////////////

    void SetSortLabel (int i){
      assert (isValid);
      sequenceLabel = i;
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::GetLabel()
    //
    // Retrieves the input label.
    /////////////////////////////////////////////////////////////////

    int GetLabel () const {
      assert (isValid);
      return inputLabel;
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::GetSortLabel()
    //
    // Retrieves the sorting label.
    /////////////////////////////////////////////////////////////////

    int GetSortLabel () const {
      assert (isValid);
      return sequenceLabel;
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::Fail()
    //
    // Checks to see if the sequence successfully loaded.
    /////////////////////////////////////////////////////////////////

    bool Fail () const {
      return !isValid;
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::Length()
    //
    // Returns the length of the sequence.
    /////////////////////////////////////////////////////////////////

    int GetLength () const {
      assert (isValid);
      assert (data);
      return length;
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::WriteMFA()
    //
    // Writes the sequence to outfile in MFA format.  Uses numColumns
    // columns per line.  If useIndex is set to false, then the
    // header is printed as normal, but if useIndex is true, then
    // ">S###" is printed where ### represents the sequence label.
    /////////////////////////////////////////////////////////////////

    void WriteMFA (ostream &outfile, int numColumns, bool useIndex = false) const {
      assert (isValid);
      assert (data);
      assert (!outfile.fail());

      // print out heading
      if (useIndex)
	outfile << ">S" << GetLabel() << endl;
      else
	outfile << ">" << header << endl;

      // print out character data
      int ct = 1;
      for (; ct <= length; ct++){
	outfile << (*data)[ct];
	if (ct % numColumns == 0) outfile << endl;
      }
      if ((ct-1) % numColumns != 0) outfile << endl;
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::Clone()
    //
    // Returns a new deep copy of the seqeuence.
    /////////////////////////////////////////////////////////////////

    Sequence *Clone () const {
      Sequence *ret = new Sequence();
      assert (ret);

      ret->isValid = isValid;
      ret->header = header;
      ret->data = new SafeVector<char>; assert (ret->data);
      *(ret->data) = *data;
      ret->length = length;
      ret->sequenceLabel = sequenceLabel;
      ret->inputLabel = inputLabel;

      return ret;
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::GetRange()
    //
    // Returns a new sequence object consisting of a range of
    // characters from the current seuquence.
    /////////////////////////////////////////////////////////////////

    Sequence *GetRange (int start, int end) const {
      Sequence *ret = new Sequence();
      assert (ret);

      assert (start >= 1 && start <= length);
      assert (end >= 1 && end <= length);
      assert (start <= end);

      ret->isValid = isValid;
      ret->header = header;
      ret->data = new SafeVector<char>; assert (ret->data);
      ret->data->push_back ('@');
      for (int i = start; i <= end; i++)
	ret->data->push_back ((*data)[i]);
      ret->length = end - start + 1;
      ret->sequenceLabel = sequenceLabel;
      ret->inputLabel = inputLabel;

      return ret;
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::AddGaps()
    //
    // Given an SafeVector<char> containing the skeleton for an
    // alignment and the identity of the current character, this
    // routine will create a new sequence with all necesssary gaps added.
    // For instance,
    //    alignment = "XXXBBYYYBBYYXX"
    //    id = 'X'
    // will perform the transformation
    //    "ATGCAGTCA" --> "ATGCC---GT--CA"
    //                    (XXXBBYYYBBYYXX)
    /////////////////////////////////////////////////////////////////

    Sequence *AddGaps (SafeVector<char> *alignment, char id){
      Sequence *ret = new Sequence();
      assert (ret);

      ret->isValid = isValid;
      ret->header = header;
      ret->data = new SafeVector<char>; assert (ret->data);
      ret->length = (int) alignment->size();
      ret->sequenceLabel = sequenceLabel;
      ret->inputLabel = inputLabel;
      ret->data->push_back ('@');

      SafeVector<char>::iterator dataIter = data->begin() + 1;
      for (SafeVector<char>::iterator iter = alignment->begin(); iter != alignment->end(); ++iter){
	if (*iter == 'B' || *iter == id){
	  ret->data->push_back (*dataIter);
	  ++dataIter;
	}
	else
	  ret->data->push_back ('-');
      }

      return ret;
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::GetString()
    //
    // Returns the sequence as a string with gaps removed.
    /////////////////////////////////////////////////////////////////

    string GetString (){
      string s = "";
      for (int i = 1; i <= length; i++){
	if ((*data)[i] != '-') s += (*data)[i];
      }
      return s;
    }


    /////////////////////////////////////////////////////////////////
    // Sequence::GetMapping()
    //
    // Returns a SafeVector<int> containing the indices of every
    // character in the sequence.  For instance, if the data is
    // "ATGCC---GT--CA", the method returns {1,2,3,4,5,9,10,13,14}.
    /////////////////////////////////////////////////////////////////

    SafeVector<int> *GetMapping () const {
      SafeVector<int> *ret = new SafeVector<int>(1, 0);
      for (int i = 1; i <= length; i++){
	if ((*data)[i] != '-') ret->push_back (i);
      }
      return ret;
    }

    /////////////////////////////////////////////////////////////////
    // Sequence::Highlight()
    //
    // Changes all positions with score >= cutoff to upper case and
    // all positions with score < cutoff to lower case.
    /////////////////////////////////////////////////////////////////

    void Highlight (const SafeVector<float> &scores, const float cutoff){
      for (int i = 1; i <= length; i++){
	if (scores[i-1] >= cutoff)
	  (*data)[i] = toupper ((*data)[i]);
	else
	  (*data)[i] = tolower ((*data)[i]);
      }
    }
  };
}
#endif
